Linear regression with gluon


In [1]:
from __future__ import print_function
from tqdm import tqdm
import mxnet as mx
from mxnet import gluon

In [2]:
# Set context
data_ctx = mx.cpu()
model_ctx = mx.cpu()

In [3]:
num_inputs = 2
num_outputs = 1
num_examples = 10000

In [4]:
W1_real = 2.0
W2_real = -3.4
b_real = 4.2

In [5]:
def real_fn(X):
    return W1_real * X[:, 0] + W2_real * X[:, 1] + b_real

In [6]:
X = mx.nd.random_normal(shape=(num_examples, num_inputs))
noise = 0.01 * mx.nd.random_normal(shape=(num_examples,))
y = real_fn(X) + noise

In [7]:
# Data iterator
batch_size = 4
train_data = gluon.data.DataLoader(gluon.data.ArrayDataset(X, y),
                                   batch_size=batch_size, 
                                   shuffle=True)

In [8]:
# Define the model
net = gluon.nn.Dense(in_units=2, units=1)

In [9]:
print(net.weight)
print(net.bias)


Parameter dense0_weight (shape=(1, 2), dtype=float32)
Parameter dense0_bias (shape=(1,), dtype=float32)

Collecting parameters


In [10]:
# By default, weights and biases are of type Parameter.
print(type(net.weight))
print(type(net.bias))


<class 'mxnet.gluon.parameter.Parameter'>
<class 'mxnet.gluon.parameter.Parameter'>

In [11]:
# After collecting the parameters, we obtain ParameterDict(ionary)
net.collect_params()


Out[11]:
dense0_ (
  Parameter dense0_weight (shape=(1, 2), dtype=float32)
  Parameter dense0_bias (shape=(1,), dtype=float32)
)

In [12]:
type(net.collect_params())


Out[12]:
mxnet.gluon.parameter.ParameterDict

Initialize parameters


In [13]:
# Initialize with standard normal distribution
net.collect_params().initialize(mx.init.Normal(sigma=1.0), ctx=model_ctx)

Accessing parameters of the network


In [14]:
print(net.weight.data())
print(net.bias.data())


[[0.93444026 0.5380863 ]]
<NDArray 1x2 @cpu(0)>

[0.]
<NDArray 1 @cpu(0)>

Passing the data to the model


In [15]:
example_data = mx.nd.array([[4,7]])

In [16]:
# "Prediction"
net(example_data)


Out[16]:
[[7.504365]]
<NDArray 1x1 @cpu(0)>

Defining the network (again)


In [17]:
# It is not crucial to define in_units
# The parameters will be initialized when the data flows through it for the first time.
net = gluon.nn.Dense(units=1)
net.collect_params().initialize(mx.init.Normal(sigma=1.), ctx=model_ctx)

Defining the loss


In [18]:
# Defining the loss as squared error
square_loss = gluon.loss.L2Loss()

Defining the optimizer


In [19]:
trainer = gluon.Trainer(params=net.collect_params(), 
                        optimizer='sgd', 
                        optimizer_params={'learning_rate': 0.0001})

Training loop


In [20]:
epochs = 20
loss_sequence = []
num_batches = num_examples / batch_size

for e in range(epochs):
    cumulative_loss = 0
    # Iterating over the batches
    for i, (data, label) in tqdm(enumerate(train_data)):
        data = data.as_in_context(model_ctx)
        label = label.as_in_context(model_ctx)
        with mx.autograd.record():
            output = net(data)
            loss = square_loss(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += mx.nd.mean(loss).asscalar()
    print("Epoch %s, loss: %s" % (e, cumulative_loss / num_examples))
    loss_sequence.append(cumulative_loss)


2500it [00:05, 469.05it/s]
Epoch 0, loss: 4.044808251080662
2500it [00:04, 516.15it/s]
Epoch 1, loss: 2.4210833267018197
2500it [00:04, 569.99it/s]
Epoch 2, loss: 1.449182435182482
2500it [00:04, 539.26it/s]
Epoch 3, loss: 0.8674455086305738
2500it [00:05, 477.87it/s]
Epoch 4, loss: 0.5192347130089998
2500it [00:04, 546.39it/s]
Epoch 5, loss: 0.31080911717228593
2500it [00:04, 543.77it/s]
Epoch 6, loss: 0.18604981883727015
2500it [00:04, 544.60it/s]
Epoch 7, loss: 0.11137253744467161
2500it [00:04, 550.24it/s]
Epoch 8, loss: 0.06667259070305154
2500it [00:04, 555.24it/s]
Epoch 9, loss: 0.03991518984034192
2500it [00:04, 535.74it/s]
Epoch 10, loss: 0.02389857202839339
2500it [00:04, 557.72it/s]
Epoch 11, loss: 0.014310872454661876
2500it [00:04, 562.10it/s]
Epoch 12, loss: 0.008571591195501969
2500it [00:04, 549.72it/s]
Epoch 13, loss: 0.005135956158919725
2500it [00:04, 556.33it/s]
Epoch 14, loss: 0.0030793819638696733
2500it [00:04, 505.95it/s]
Epoch 15, loss: 0.0018482797156397282
2500it [00:04, 547.82it/s]
Epoch 16, loss: 0.00111145593189176
2500it [00:04, 579.36it/s]
Epoch 17, loss: 0.0006702495257908595
2500it [00:04, 533.51it/s]
Epoch 18, loss: 0.00040625386701976824
2500it [00:04, 560.46it/s]
Epoch 19, loss: 0.00024821289024657745

Getting the parameters


In [21]:
# ParameterDict
params = net.collect_params()

In [22]:
for param in params.values():
    print(param.name, param.data())


dense1_weight 
[[ 1.9863372 -3.3749702]]
<NDArray 1x2 @cpu(0)>
dense1_bias 
[4.175586]
<NDArray 1 @cpu(0)>

In [23]:
list(params.values())[0].data()


Out[23]:
[[ 1.9863372 -3.3749702]]
<NDArray 1x2 @cpu(0)>

In [24]:
[W1, W2] = list(params.values())[0].data()[0]

In [25]:
b = list(params.values())[1].data()[0]

In [26]:
# Learned values
print(W1_real)
print(W2_real)
print(b_real)


2.0
-3.4
4.2

In [27]:
# True values
print(W1.asscalar())
print(W2.asscalar())
print(b.asscalar())


1.9863372
-3.3749702
4.175586